mirror of
https://github.com/binux/pyspider.git
synced 2024-11-25 16:34:30 +08:00
added https to couchdb + cleanup + added couchdb to docs
This commit is contained in:
parent
afb0afa970
commit
e69f5de584
5
.env
5
.env
@ -1,5 +0,0 @@
|
||||
COUCHDB_USER=user
|
||||
COUCHDB_PASSWORD=password
|
||||
COUCHDB_NAME=couchdb
|
||||
COUCHDB_PORT_5984_TCP_ADDR=couchdb
|
||||
COUCHDB_PORT_5984_TCP_PORT=5984
|
@ -5,7 +5,7 @@ A Powerful Spider(Web Crawler) System in Python. **[TRY IT NOW!][Demo]**
|
||||
|
||||
- Write script in Python
|
||||
- Powerful WebUI with script editor, task monitor, project manager and result viewer
|
||||
- [MySQL](https://www.mysql.com/), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend
|
||||
- [MySQL](https://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend
|
||||
- [RabbitMQ](http://www.rabbitmq.com/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue
|
||||
- Task priority, retry, periodical, recrawl by age, etc...
|
||||
- Distributed architecture, Crawl Javascript pages, Python 2.{6,7}, 3.{3,4,5,6} support, etc...
|
||||
|
@ -1,6 +1,7 @@
|
||||
version: "3.7"
|
||||
|
||||
# docker build ./ -t pyspider:latest
|
||||
# replace /path/to/dir/ to point to config_example.json
|
||||
|
||||
services:
|
||||
rabbitmq:
|
||||
@ -16,8 +17,12 @@ services:
|
||||
- pyspider
|
||||
ports:
|
||||
- "5984:5984"
|
||||
env_file: .env
|
||||
|
||||
environment:
|
||||
- COUCHDB_NAME=couchdb
|
||||
- COUCHDB_USER=user
|
||||
- COUCHDB_PASSWORD=password
|
||||
- COUCHDB_HTTPS=true
|
||||
# OR we can replace couchdb with mysql
|
||||
#mysql:
|
||||
# image: mysql:latest
|
||||
# container_name: mysql
|
||||
@ -27,15 +32,13 @@ services:
|
||||
# - MYSQL_ALLOW_EMPTY_PASSWORD=yes
|
||||
# networks:
|
||||
# - pyspider
|
||||
# env_file: .env
|
||||
phantomjs:
|
||||
image: pyspider:latest
|
||||
container_name: phantomjs
|
||||
networks:
|
||||
- pyspider
|
||||
env_file: .env
|
||||
volumes:
|
||||
- /Users/Keith/Documents/Projects/python/python_projects/pyspider/pyspider/config_example.json:/opt/pyspider/config.json
|
||||
- /path/to/dir/config_example.json:/opt/pyspider/config.json
|
||||
command: -c config.json phantomjs
|
||||
depends_on:
|
||||
- couchdb
|
||||
@ -46,9 +49,8 @@ services:
|
||||
container_name: result
|
||||
networks:
|
||||
- pyspider
|
||||
env_file: .env
|
||||
volumes:
|
||||
- /Users/Keith/Documents/Projects/python/python_projects/pyspider/pyspider/config_example.json:/opt/pyspider/config.json
|
||||
- /path/to/dir/config_example.json:/opt/pyspider/config.json
|
||||
command: -c config.json result_worker
|
||||
depends_on:
|
||||
- couchdb
|
||||
@ -59,9 +61,8 @@ services:
|
||||
image: pyspider:latest
|
||||
networks:
|
||||
- pyspider
|
||||
env_file: .env
|
||||
volumes:
|
||||
- /Users/Keith/Documents/Projects/python/python_projects/pyspider/pyspider/config_example.json:/opt/pyspider/config.json
|
||||
- /path/to/dir/config_example.json:/opt/pyspider/config.json
|
||||
command: -c config.json processor
|
||||
depends_on:
|
||||
- couchdb
|
||||
@ -72,9 +73,8 @@ services:
|
||||
container_name: fetcher
|
||||
networks:
|
||||
- pyspider
|
||||
env_file: .env
|
||||
volumes:
|
||||
- /Users/Keith/Documents/Projects/python/python_projects/pyspider/pyspider/config_example.json:/opt/pyspider/config.json
|
||||
- /path/to/dir/config_example.json:/opt/pyspider/config.json
|
||||
command: -c config.json fetcher
|
||||
depends_on:
|
||||
- couchdb
|
||||
@ -85,9 +85,8 @@ services:
|
||||
container_name: scheduler
|
||||
networks:
|
||||
- pyspider
|
||||
env_file: .env
|
||||
volumes:
|
||||
- /Users/Keith/Documents/Projects/python/python_projects/pyspider/pyspider/config_example.json:/opt/pyspider/config.json
|
||||
- /path/to/dir/config_example.json:/opt/pyspider/config.json
|
||||
command: -c config.json scheduler
|
||||
depends_on:
|
||||
- couchdb
|
||||
@ -100,9 +99,8 @@ services:
|
||||
- "5050:5000"
|
||||
networks:
|
||||
- pyspider
|
||||
env_file: .env
|
||||
volumes:
|
||||
- /Users/Keith/Documents/Projects/python/python_projects/pyspider/pyspider/config_example.json:/opt/pyspider/config.json
|
||||
- /path/to/dir/config_example.json:/opt/pyspider/config.json
|
||||
environment:
|
||||
- SCHEDULER_PORT_23333_TCP_ADDR=scheduler
|
||||
command: -c config.json webui
|
||||
|
@ -72,6 +72,8 @@ sqlite:
|
||||
mongodb:
|
||||
mongodb+type://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]]
|
||||
more: http://docs.mongodb.org/manual/reference/connection-string/
|
||||
couchdb:
|
||||
couchdb+type://[username:password@]host[:port]
|
||||
sqlalchemy:
|
||||
sqlalchemy+postgresql+type://user:passwd@host:port/database
|
||||
sqlalchemy+mysql+mysqlconnector+type://user:passwd@host:port/database
|
||||
|
@ -8,7 +8,7 @@ To deploy pyspider in product environment, running component in each process and
|
||||
Installation
|
||||
------------
|
||||
|
||||
To deploy pyspider components in each single processes, you need at least one database service. pyspider now supports [MySQL](http://www.mysql.com/), [MongoDB](http://www.mongodb.org/) and [PostgreSQL](http://www.postgresql.org/). You can choose one of them.
|
||||
To deploy pyspider components in each single processes, you need at least one database service. pyspider now supports [MySQL](http://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](http://www.mongodb.org/) and [PostgreSQL](http://www.postgresql.org/). You can choose one of them.
|
||||
|
||||
And you need a message queue service to connect the components together. You can use [RabbitMQ](http://www.rabbitmq.com/) or [Redis](http://redis.io/) as message queue.
|
||||
|
||||
@ -63,6 +63,8 @@ sqlite:
|
||||
mongodb:
|
||||
mongodb+type://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]]
|
||||
more: http://docs.mongodb.org/manual/reference/connection-string/
|
||||
couchdb:
|
||||
couchdb+type://[username:password@]host[:port][?options]
|
||||
sqlalchemy:
|
||||
sqlalchemy+postgresql+type://user:passwd@host:port/database
|
||||
sqlalchemy+mysql+mysqlconnector+type://user:passwd@host:port/database
|
||||
|
@ -5,7 +5,7 @@ A Powerful Spider(Web Crawler) System in Python. **[TRY IT NOW!][Demo]**
|
||||
|
||||
- Write script in Python
|
||||
- Powerful WebUI with script editor, task monitor, project manager and result viewer
|
||||
- [MySQL](https://www.mysql.com/), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend
|
||||
- [MySQL](https://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend
|
||||
- [RabbitMQ](http://www.rabbitmq.com/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue
|
||||
- Task priority, retry, periodical, recrawl by age, etc...
|
||||
- Distributed architecture, Crawl Javascript pages, Python 2&3, etc...
|
||||
|
@ -34,7 +34,7 @@ def connect_database(url):
|
||||
elasticsearch:
|
||||
elasticsearch+type://host:port/?index=pyspider
|
||||
couchdb:
|
||||
couchdb+type://host[:port]
|
||||
couchdb+type://[username:password@]host[:port]
|
||||
local:
|
||||
local+projectdb://filepath,filepath
|
||||
|
||||
@ -207,14 +207,29 @@ def _connect_elasticsearch(parsed, dbtype):
|
||||
|
||||
|
||||
def _connect_couchdb(parsed, dbtype, url):
|
||||
# TODO: Add https + auth as parameters
|
||||
url = "http://" + parsed.netloc + "/"
|
||||
if os.environ.get('COUCHDB_HTTPS'):
|
||||
url = "https://" + parsed.netloc + "/"
|
||||
else:
|
||||
url = "http://" + parsed.netloc + "/"
|
||||
params = {}
|
||||
params['username'] = os.environ.get('COUCHDB_USER') or 'user'
|
||||
params['password'] = os.environ.get('COUCHDB_PASSWORD') or 'password'
|
||||
|
||||
username = None
|
||||
password = None
|
||||
if '@' in parsed.netloc:
|
||||
# netloc looks like: 'user:pass@couchdb:999'
|
||||
url = parsed.netloc[parsed.netloc.find("@")+1:]
|
||||
# extract the username and password
|
||||
username = parsed.netloc[:parsed.netloc.find(":")]
|
||||
password = parsed.netloc[parsed.netloc.find(":")+1:parsed.netloc.find("@")]
|
||||
|
||||
# default to env, then url, then hard coded
|
||||
params['username'] = os.environ.get('COUCHDB_USER') or username or 'user'
|
||||
params['password'] = os.environ.get('COUCHDB_PASSWORD') or password or 'password'
|
||||
|
||||
# create required CouchDB databases if not already present
|
||||
requests.put(url+"_users")
|
||||
requests.put(url+"_replicator")
|
||||
|
||||
# create the admin user
|
||||
# NOTE: Over docker, this user is already created when COUCHDB_USER and COUCHDB_PASSWORD are set
|
||||
requests.put(url+'_node/_local/_config/admins/'+ params['username'],
|
||||
|
@ -114,8 +114,10 @@ def cli(ctx, **kwargs):
|
||||
elif os.environ.get('COUCHDB_NAME'):
|
||||
kwargs[db] = utils.Get(lambda db=db: connect_database(
|
||||
'couchdb+%s://%s:%s/%s' % (
|
||||
db, os.environ['COUCHDB_PORT_5984_TCP_ADDR'],
|
||||
os.environ['COUCHDB_PORT_5984_TCP_PORT'], db)))
|
||||
db,
|
||||
os.environ['COUCHDB_PORT_5984_TCP_ADDR'] or 'couchdb',
|
||||
os.environ['COUCHDB_PORT_5984_TCP_PORT'] or '5984',
|
||||
db)))
|
||||
elif ctx.invoked_subcommand == 'bench':
|
||||
if kwargs['data_path'] == './data':
|
||||
kwargs['data_path'] += '/bench'
|
||||
|
Loading…
Reference in New Issue
Block a user